# This analysis evaluates Meridian Health Systems' "Thrive at Work" wellness program using
# machine learning techniques to: (1) predict program success from employee attributes
# (random forest and SVM classifiers), and (2) segment employees into wellness profiles
# via K-means and K-medoids clustering.
# Data manipulation
library(tidyverse)
library(data.table)
# Machine Learning
library(caret)
library(randomForest)
library(e1071)
library(cluster)
# Visualization
library(ggplot2)
library(GGally)
library(corrplot)
library(gridExtra)
library(factoextra)
library(patchwork)
# Model evaluation
library(pROC)
library(ROCR)
# Set seed for reproducibility
set.seed(42)
# Load the data: 1000 employees x 18 columns; program_success (0/1) is the target
wellness_data <- read.csv("data/workplace_wellness_data.csv", stringsAsFactors = FALSE)
# Display structure
str(wellness_data)
## 'data.frame': 1000 obs. of 18 variables:
## $ employee_id : chr "EMP_0001" "EMP_0002" "EMP_0003" "EMP_0004" ...
## $ age : int 37 28 45 41 56 44 33 26 48 38 ...
## $ gender : chr "Male" "Male" "Female" "Female" ...
## $ department : chr "Sales" "Operations" "Administration" "Sales" ...
## $ years_employed : num 8 9.4 7.1 3.4 0.5 7 2.2 3.4 0.5 6.6 ...
## $ bmi : num 21.4 31 24.6 34.1 27.4 18 25.9 31.8 23.2 30 ...
## $ bp_systolic : int 123 148 149 164 141 122 137 129 132 151 ...
## $ resting_hr : int 67 70 58 70 88 63 75 65 66 80 ...
## $ baseline_activity_hrs: num 1.3 0.8 0.4 1.9 1.6 5.6 0.9 0.4 0.6 0.8 ...
## $ sleep_quality : num 2.5 3.2 8.8 4 6 7.7 6 4.3 6 5.7 ...
## $ stress_score : num 9.8 8 6.5 7.9 7 3.1 4.4 8.9 6.7 4.8 ...
## $ job_satisfaction : num 6 6.8 4.6 3.6 6.5 10 6.2 4.3 6.6 6.5 ...
## $ social_support : num 8.5 9.6 9.9 7.4 5.9 6.9 7.7 6 8.6 4.7 ...
## $ self_efficacy : num 6.3 6.2 4.4 6.6 7.9 6.1 5.2 8.8 6.2 5.8 ...
## $ sessions_attended : int 7 5 7 5 8 11 6 11 6 6 ...
## $ app_engagement : num 27.4 25.6 28.5 69.6 55.8 39.7 46.9 89.3 72 38.9 ...
## $ peer_challenge : int 0 0 0 0 0 0 1 0 1 1 ...
## $ program_success : int 1 0 1 1 0 1 1 0 1 1 ...
# Display first few rows
head(wellness_data)
## employee_id age gender department years_employed bmi bp_systolic
## 1 EMP_0001 37 Male Sales 8.0 21.4 123
## 2 EMP_0002 28 Male Operations 9.4 31.0 148
## 3 EMP_0003 45 Female Administration 7.1 24.6 149
## 4 EMP_0004 41 Female Sales 3.4 34.1 164
## 5 EMP_0005 56 Female Manufacturing 0.5 27.4 141
## 6 EMP_0006 44 Male IT 7.0 18.0 122
## resting_hr baseline_activity_hrs sleep_quality stress_score job_satisfaction
## 1 67 1.3 2.5 9.8 6.0
## 2 70 0.8 3.2 8.0 6.8
## 3 58 0.4 8.8 6.5 4.6
## 4 70 1.9 4.0 7.9 3.6
## 5 88 1.6 6.0 7.0 6.5
## 6 63 5.6 7.7 3.1 10.0
## social_support self_efficacy sessions_attended app_engagement peer_challenge
## 1 8.5 6.3 7 27.4 0
## 2 9.6 6.2 5 25.6 0
## 3 9.9 4.4 7 28.5 0
## 4 7.4 6.6 5 69.6 0
## 5 5.9 7.9 8 55.8 0
## 6 6.9 6.1 11 39.7 0
## program_success
## 1 1
## 2 0
## 3 1
## 4 1
## 5 0
## 6 1
# Summary statistics (note: most psychosocial scores are on a 1-10 scale)
summary(wellness_data)
## employee_id age gender department
## Length:1000 Min. :22.00 Length:1000 Length:1000
## Class :character 1st Qu.:35.75 Class :character Class :character
## Mode :character Median :42.00 Mode :character Mode :character
## Mean :42.28
## 3rd Qu.:49.00
## Max. :65.00
## years_employed bmi bp_systolic resting_hr
## Min. : 0.500 Min. :18.00 Min. :113.0 Min. : 50.00
## 1st Qu.: 1.600 1st Qu.:24.70 1st Qu.:137.0 1st Qu.: 64.00
## Median : 3.700 Median :28.00 Median :143.0 Median : 70.00
## Mean : 5.287 Mean :28.18 Mean :143.6 Mean : 69.92
## 3rd Qu.: 7.500 3rd Qu.:31.60 3rd Qu.:151.0 3rd Qu.: 75.00
## Max. :30.000 Max. :45.00 Max. :173.0 Max. :100.00
## baseline_activity_hrs sleep_quality stress_score job_satisfaction
## Min. : 0.000 Min. : 2.000 Min. : 1.000 Min. : 1.700
## 1st Qu.: 0.600 1st Qu.: 4.800 1st Qu.: 4.100 1st Qu.: 5.300
## Median : 1.500 Median : 5.900 Median : 5.700 Median : 6.300
## Mean : 2.375 Mean : 5.940 Mean : 5.708 Mean : 6.377
## 3rd Qu.: 3.300 3rd Qu.: 7.025 3rd Qu.: 7.200 3rd Qu.: 7.500
## Max. :15.000 Max. :10.000 Max. :10.000 Max. :10.000
## social_support self_efficacy sessions_attended app_engagement
## Min. : 1.000 Min. : 1.500 Min. : 2.000 Min. : 0.50
## 1st Qu.: 4.800 1st Qu.: 5.100 1st Qu.: 6.000 1st Qu.:24.30
## Median : 6.100 Median : 6.100 Median : 7.000 Median :39.40
## Mean : 6.055 Mean : 6.162 Mean : 7.336 Mean :41.99
## 3rd Qu.: 7.400 3rd Qu.: 7.200 3rd Qu.: 9.000 3rd Qu.:58.70
## Max. :10.000 Max. :10.000 Max. :12.000 Max. :97.90
## peer_challenge program_success
## Min. :0.000 Min. :0.000
## 1st Qu.:0.000 1st Qu.:0.000
## Median :0.000 Median :1.000
## Mean :0.391 Mean :0.723
## 3rd Qu.:1.000 3rd Qu.:1.000
## Max. :1.000 Max. :1.000
# Convert all categorical columns to factors in a single pass
wellness_data <- wellness_data %>%
  mutate(across(c(gender, department, peer_challenge, program_success), as.factor))
# Check for missing values
cat("Missing values per column:\n")
## Missing values per column:
colSums(is.na(wellness_data))
## employee_id age gender
## 0 0 0
## department years_employed bmi
## 0 0 0
## bp_systolic resting_hr baseline_activity_hrs
## 0 0 0
## sleep_quality stress_score job_satisfaction
## 0 0 0
## social_support self_efficacy sessions_attended
## 0 0 0
## app_engagement peer_challenge program_success
## 0 0 0
# Check class distribution (counts, then proportions, of the same table)
cat("\n\nProgram Success Distribution:\n")
##
##
## Program Success Distribution:
success_tab <- table(wellness_data$program_success)
success_tab
##
## 0 1
## 277 723
prop.table(success_tab)
##
## 0 1
## 0.277 0.723
# Bar chart of the overall success/failure counts, with count labels above bars
# (geom_bar defaults to stat = "count", so it is not spelled out)
p1 <- ggplot(wellness_data, aes(x = program_success, fill = program_success)) +
  geom_bar(width = 0.6) +
  geom_text(stat = 'count', aes(label = after_stat(count)), vjust = -0.5, size = 5) +
  scale_fill_manual(values = c("0" = "#E74C3C", "1" = "#27AE60")) +
  labs(
    title = "Program Success Distribution",
    x = "Program Success",
    y = "Count"
  ) +
  theme_minimal() +
  theme(
    legend.position = "none",
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold")
  )
print(p1)
# Helper: boxplot of one numeric variable split by program success.
# data  - data frame containing program_success and the variable
# yvar  - unquoted column name (tidy-eval via {{ }})
# title - plot title; ylab - y-axis label
# Extracted because this red/green boxplot pattern was repeated verbatim.
success_boxplot <- function(data, yvar, title, ylab) {
  ggplot(data, aes(x = program_success, y = {{ yvar }}, fill = program_success)) +
    geom_boxplot(alpha = 0.7) +
    scale_fill_manual(values = c("0" = "#E74C3C", "1" = "#27AE60")) +
    labs(title = title, x = "Program Success", y = ylab) +
    theme_minimal() +
    theme(legend.position = "none")
}
# Age distribution by success
p2 <- success_boxplot(wellness_data, age,
                      "Age Distribution by Program Success", "Age (years)")
# Gender distribution: within-gender proportion of success (position = "fill")
p3 <- ggplot(wellness_data, aes(x = gender, fill = program_success)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = c("0" = "#E74C3C", "1" = "#27AE60")) +
  labs(title = "Program Success by Gender",
       x = "Gender", y = "Proportion") +
  theme_minimal()
# Department distribution: within-department proportion of success
p4 <- ggplot(wellness_data, aes(x = department, fill = program_success)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = c("0" = "#E74C3C", "1" = "#27AE60")) +
  labs(title = "Program Success by Department",
       x = "Department", y = "Proportion") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Years employed
p5 <- success_boxplot(wellness_data, years_employed,
                      "Years Employed by Program Success", "Years Employed")
# Combine plots in a 2x2 grid (patchwork operators)
(p2 | p3) / (p4 | p5)
# Baseline health metrics vs. program success (boxplots, red = fail / green = success)
# BMI
p6 <- ggplot(wellness_data, aes(x = program_success, y = bmi, fill = program_success)) +
geom_boxplot(alpha = 0.7) +
scale_fill_manual(values = c("0" = "#E74C3C", "1" = "#27AE60")) +
labs(title = "BMI by Program Success", x = "Program Success", y = "BMI") +
theme_minimal() +
theme(legend.position = "none")
# Blood Pressure
p7 <- ggplot(wellness_data, aes(x = program_success, y = bp_systolic, fill = program_success)) +
geom_boxplot(alpha = 0.7) +
scale_fill_manual(values = c("0" = "#E74C3C", "1" = "#27AE60")) +
labs(title = "Systolic BP by Program Success", x = "Program Success", y = "BP (mmHg)") +
theme_minimal() +
theme(legend.position = "none")
# Resting HR
p8 <- ggplot(wellness_data, aes(x = program_success, y = resting_hr, fill = program_success)) +
geom_boxplot(alpha = 0.7) +
scale_fill_manual(values = c("0" = "#E74C3C", "1" = "#27AE60")) +
labs(title = "Resting HR by Program Success", x = "Program Success", y = "HR (bpm)") +
theme_minimal() +
theme(legend.position = "none")
# Baseline Activity
p9 <- ggplot(wellness_data, aes(x = program_success, y = baseline_activity_hrs, fill = program_success)) +
geom_boxplot(alpha = 0.7) +
scale_fill_manual(values = c("0" = "#E74C3C", "1" = "#27AE60")) +
labs(title = "Baseline Activity by Program Success",
x = "Program Success", y = "Activity (hrs/week)") +
theme_minimal() +
theme(legend.position = "none")
# 2x2 grid via patchwork
(p6 | p7) / (p8 | p9)
# Program engagement vs. success: attendance boxplot and peer-challenge proportions
# Sessions Attended
p16 <- ggplot(wellness_data, aes(x = program_success, y = sessions_attended, fill = program_success)) +
geom_boxplot(alpha = 0.7) +
scale_fill_manual(values = c("0" = "#E74C3C", "1" = "#27AE60")) +
labs(title = "Sessions Attended by Program Success",
x = "Program Success", y = "Sessions (out of 12)") +
theme_minimal() +
theme(legend.position = "none")
# Peer Challenge Participation (proportion of success within each group)
p17 <- ggplot(wellness_data, aes(x = peer_challenge, fill = program_success)) +
geom_bar(position = "fill") +
scale_fill_manual(values = c("0" = "#E74C3C", "1" = "#27AE60")) +
labs(title = "Program Success by Peer Challenge Participation",
x = "Peer Challenge (0=No, 1=Yes)", y = "Proportion") +
theme_minimal()
# Side-by-side via patchwork
p16 | p17
</style_update>
# Correlation analysis across the numeric predictors.
# By this point the categorical columns are factors, so selecting with
# where(is.numeric) yields exactly the 13 numeric columns in data-frame order.
numeric_vars <- wellness_data %>%
  select(where(is.numeric))
# Pairwise Pearson correlations on complete cases
cor_matrix <- cor(numeric_vars, use = "complete.obs")
# Heatmap of the correlation matrix (red = negative, green = positive)
corrplot(cor_matrix, method = "color", type = "upper",
         tl.col = "black", tl.srt = 45,
         addCoef.col = "black", number.cex = 0.7,
         col = colorRampPalette(c("#E74C3C", "white", "#27AE60"))(200),
         title = "Correlation Matrix of Numeric Variables",
         mar = c(0,0,2,0))
# Prepare features (remove employee_id and target)
features <- wellness_data %>%
select(-employee_id, -program_success)
# Target variable
target <- wellness_data$program_success
# Split data into training (70%) and testing (30%)
# createDataPartition stratifies on the target, preserving the ~72/28 class mix
set.seed(42)
train_index <- createDataPartition(target, p = 0.7, list = FALSE)
X_train <- features[train_index, ]
X_test <- features[-train_index, ]
y_train <- target[train_index]
y_test <- target[-train_index]
cat("Training set size:", nrow(X_train), "\n")
## Training set size: 701
cat("Test set size:", nrow(X_test), "\n")
## Test set size: 299
cat("\nTraining set class distribution:\n")
##
## Training set class distribution:
print(table(y_train))
## y_train
## 0 1
## 194 507
cat("\nTest set class distribution:\n")
##
## Test set class distribution:
print(table(y_test))
## y_test
## 0 1
## 83 216
# Train Random Forest classifier on the raw (un-scaled) features;
# trees are scale-invariant so no preprocessing is needed here.
cat("Training Random Forest...\n")
## Training Random Forest...
# NOTE(review): mtry = sqrt(17) ~= 4.12 is non-integer; consider
# floor(sqrt(ncol(X_train))) for clarity — confirm randomForest truncates it.
rf_model <- randomForest(x = X_train,
y = y_train,
ntree = 500,
mtry = sqrt(ncol(X_train)),
importance = TRUE,
nodesize = 5)
# Make predictions: hard class labels, then P(class == "1") for ROC
rf_pred <- predict(rf_model, X_test)
rf_pred_prob <- predict(rf_model, X_test, type = "prob")[, 2]
# Calculate metrics with "1" (success) as the positive class
rf_cm <- confusionMatrix(rf_pred, y_test, positive = "1")
# Print results
print(rf_cm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 39 11
## 1 44 205
##
## Accuracy : 0.8161
## 95% CI : (0.7674, 0.8583)
## No Information Rate : 0.7224
## P-Value [Acc > NIR] : 0.0001127
##
## Kappa : 0.4774
##
## Mcnemar's Test P-Value : 1.597e-05
##
## Sensitivity : 0.9491
## Specificity : 0.4699
## Pos Pred Value : 0.8233
## Neg Pred Value : 0.7800
## Prevalence : 0.7224
## Detection Rate : 0.6856
## Detection Prevalence : 0.8328
## Balanced Accuracy : 0.7095
##
## 'Positive' Class : 1
##
# Calculate additional metrics (pulled from the caret confusion matrix object)
rf_accuracy <- rf_cm$overall['Accuracy']
rf_precision <- rf_cm$byClass['Precision']
rf_recall <- rf_cm$byClass['Recall']
rf_f1 <- rf_cm$byClass['F1']
# ROC and AUC from the predicted success probabilities
rf_roc <- roc(y_test, rf_pred_prob)
rf_auc <- auc(rf_roc)
cat("\n=== Random Forest Performance ===\n")
##
## === Random Forest Performance ===
cat(sprintf("Accuracy: %.4f\n", rf_accuracy))
## Accuracy: 0.8161
cat(sprintf("Precision: %.4f\n", rf_precision))
## Precision: 0.8233
cat(sprintf("Recall: %.4f\n", rf_recall))
## Recall: 0.9491
cat(sprintf("F1-Score: %.4f\n", rf_f1))
## F1-Score: 0.8817
cat(sprintf("AUC-ROC: %.4f\n", rf_auc))
## AUC-ROC: 0.8217
# Extract feature importance once; the original called importance(rf_model)
# three times to build one data frame.
rf_importance <- importance(rf_model)
importance_df <- data.frame(
  Feature = rownames(rf_importance),
  MeanDecreaseGini = rf_importance[, "MeanDecreaseGini"],
  MeanDecreaseAccuracy = rf_importance[, "MeanDecreaseAccuracy"]
) %>%
  arrange(desc(MeanDecreaseGini))
# Top 10 features by Gini
top_features <- head(importance_df, 10)
cat("\nTop 10 Most Important Features:\n")
##
## Top 10 Most Important Features:
print(top_features)
## Feature MeanDecreaseGini MeanDecreaseAccuracy
## age age 67.14844 44.0314295
## app_engagement app_engagement 18.85573 7.6078735
## social_support social_support 17.14399 6.6338596
## bmi bmi 16.24517 -0.2925174
## sleep_quality sleep_quality 14.46832 4.1569955
## self_efficacy self_efficacy 13.58756 1.3320646
## department department 12.69344 3.9301796
## stress_score stress_score 12.16555 1.0689508
## job_satisfaction job_satisfaction 12.15157 0.2062854
## years_employed years_employed 11.51547 -1.1243991
# Visualize feature importance: Gini (impurity-based) vs. permutation accuracy,
# both sorted with reorder() so bars appear in rank order after coord_flip()
p_imp1 <- ggplot(top_features, aes(x = reorder(Feature, MeanDecreaseGini),
y = MeanDecreaseGini)) +
geom_bar(stat = "identity", fill = "#3498DB", alpha = 0.8) +
coord_flip() +
labs(title = "Feature Importance (Mean Decrease Gini)",
x = "Feature", y = "Mean Decrease Gini") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, face = "bold"))
p_imp2 <- ggplot(top_features, aes(x = reorder(Feature, MeanDecreaseAccuracy),
y = MeanDecreaseAccuracy)) +
geom_bar(stat = "identity", fill = "#E67E22", alpha = 0.8) +
coord_flip() +
labs(title = "Feature Importance (Mean Decrease Accuracy)",
x = "Feature", y = "Mean Decrease Accuracy") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, face = "bold"))
# Stacked vertically via patchwork
p_imp1 / p_imp2
# --- Linear SVM: preprocessing pipeline + training + evaluation ---
# Ensure target is a factor with correct level order ("0" first, "1" positive)
y_train <- factor(y_train, levels = c("0","1"))
y_test <- factor(y_test, levels = c("0","1"))
# 1) One-hot encode ALL predictors (handles factors/characters)
x_train_mm <- model.matrix(~ . - 1, data = X_train)
x_test_mm <- model.matrix(~ . - 1, data = X_test)
# 2) Align columns (in case some factor levels appear only in train or test)
# NOTE(review): base matrix indexing by a name that is absent errors rather
# than producing NA columns, so the is.na() fill below is dead code unless the
# column sets already match (they do here: factors were built on the full
# data, so model.matrix emits every level in both splits) — confirm intent.
all_cols <- union(colnames(x_train_mm), colnames(x_test_mm))
x_train_mm <- x_train_mm[, all_cols, drop = FALSE]
x_test_mm <- x_test_mm[, all_cols, drop = FALSE]
x_train_mm[is.na(x_train_mm)] <- 0
x_test_mm[is.na(x_test_mm)] <- 0
# 3) Impute + remove near-zero variance + standardize using TRAIN only
# (fitting preProcess on train avoids test-set leakage)
pp <- caret::preProcess(
x_train_mm,
method = c("medianImpute", "nzv", "center", "scale")
)
x_train_pp <- predict(pp, x_train_mm)
x_test_pp <- predict(pp, x_test_mm)
# 4) Train SVM linear (don't rescale inside svm; data is already standardized)
cat("Training SVM (Linear Kernel)...\n")
## Training SVM (Linear Kernel)...
svm_linear <- e1071::svm(
x = x_train_pp,
y = y_train,
kernel = "linear",
probability = TRUE,
scale = FALSE
)
# 5) Predict: hard labels, then P(class == "1") from the probability attribute
svm_linear_pred <- predict(svm_linear, x_test_pp)
svm_linear_prob <- attr(
predict(svm_linear, x_test_pp, probability = TRUE),
"probabilities"
)[, "1"]
# 6) Metrics — note the model degenerates to predicting all "1"
cm <- confusionMatrix(svm_linear_pred, y_test, positive = "1")
print(cm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 0 0
## 1 83 216
##
## Accuracy : 0.7224
## 95% CI : (0.6679, 0.7724)
## No Information Rate : 0.7224
## P-Value [Acc > NIR] : 0.5295
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.7224
## Neg Pred Value : NaN
## Prevalence : 0.7224
## Detection Rate : 0.7224
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : 1
##
# ROC/AUC for the linear SVM; levels/direction fixed so AUC reflects P("1")
roc_obj <- pROC::roc(y_test, svm_linear_prob, levels = c("0","1"), direction = "<")
auc_val <- pROC::auc(roc_obj)
# Reuse the confusion matrix and ROC computed above under stable names for the
# comparison table; the original recomputed both with identical arguments.
svm_linear_cm <- cm
svm_linear_accuracy <- svm_linear_cm$overall["Accuracy"]
svm_linear_precision <- svm_linear_cm$byClass["Precision"]
svm_linear_recall <- svm_linear_cm$byClass["Recall"]
svm_linear_f1 <- svm_linear_cm$byClass["F1"]
svm_linear_roc <- roc_obj
svm_linear_auc <- auc_val
cat("\n=== SVM Linear Performance ===\n")
##
## === SVM Linear Performance ===
cat(sprintf("Accuracy: %.4f\n", cm$overall["Accuracy"]))
## Accuracy: 0.7224
cat(sprintf("Precision: %.4f\n", cm$byClass["Precision"]))
## Precision: 0.7224
cat(sprintf("Recall: %.4f\n", cm$byClass["Recall"]))
## Recall: 1.0000
cat(sprintf("F1-Score: %.4f\n", cm$byClass["F1"]))
## F1-Score: 0.8388
cat(sprintf("AUC-ROC: %.4f\n", auc_val))
## AUC-ROC: 0.5951
# --- RBF SVM on the same preprocessed matrices ---
cat("Training SVM (RBF Kernel)...\n")
## Training SVM (RBF Kernel)...
# Make sure y is a factor with levels c("0","1")
# (redundant — already releveled before the linear SVM — but harmless)
y_train <- factor(y_train, levels = c("0","1"))
y_test <- factor(y_test, levels = c("0","1"))
svm_rbf <- e1071::svm(
x = x_train_pp,
y = y_train,
kernel = "radial",
probability = TRUE,
scale = FALSE
)
# Predictions: hard labels, then P(class == "1")
svm_rbf_pred <- predict(svm_rbf, x_test_pp)
svm_rbf_prob <- attr(
predict(svm_rbf, x_test_pp, probability = TRUE),
"probabilities"
)[, "1"]
# Confusion matrix + metrics ("1" = success is the positive class)
svm_rbf_cm <- confusionMatrix(svm_rbf_pred, y_test, positive = "1")
print(svm_rbf_cm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 12 2
## 1 71 214
##
## Accuracy : 0.7559
## 95% CI : (0.7031, 0.8035)
## No Information Rate : 0.7224
## P-Value [Acc > NIR] : 0.1089
##
## Kappa : 0.1819
##
## Mcnemar's Test P-Value : 1.737e-15
##
## Sensitivity : 0.9907
## Specificity : 0.1446
## Pos Pred Value : 0.7509
## Neg Pred Value : 0.8571
## Prevalence : 0.7224
## Detection Rate : 0.7157
## Detection Prevalence : 0.9532
## Balanced Accuracy : 0.5677
##
## 'Positive' Class : 1
##
svm_rbf_accuracy <- svm_rbf_cm$overall["Accuracy"]
svm_rbf_precision <- svm_rbf_cm$byClass["Precision"]
svm_rbf_recall <- svm_rbf_cm$byClass["Recall"]
svm_rbf_f1 <- svm_rbf_cm$byClass["F1"]
# ROC + AUC (same levels/direction convention as the linear SVM)
svm_rbf_roc <- pROC::roc(y_test, svm_rbf_prob, levels = c("0","1"), direction = "<")
svm_rbf_auc <- pROC::auc(svm_rbf_roc)
cat("\n=== SVM RBF Performance ===\n")
##
## === SVM RBF Performance ===
cat(sprintf("Accuracy: %.4f\n", svm_rbf_accuracy))
## Accuracy: 0.7559
cat(sprintf("Precision: %.4f\n", svm_rbf_precision))
## Precision: 0.7509
cat(sprintf("Recall: %.4f\n", svm_rbf_recall))
## Recall: 0.9907
cat(sprintf("F1-Score: %.4f\n", svm_rbf_f1))
## F1-Score: 0.8543
cat(sprintf("AUC-ROC: %.4f\n", svm_rbf_auc))
## AUC-ROC: 0.7653
# Side-by-side performance table for the three classifiers (Part 1, Deliverable 1)
comparison_table <- data.frame(
Model = c("Random Forest", "SVM Linear", "SVM RBF"),
Accuracy = c(as.numeric(rf_accuracy), as.numeric(svm_linear_accuracy), as.numeric(svm_rbf_accuracy)),
Precision = c(as.numeric(rf_precision), as.numeric(svm_linear_precision), as.numeric(svm_rbf_precision)),
Recall = c(as.numeric(rf_recall), as.numeric(svm_linear_recall), as.numeric(svm_rbf_recall)),
F1_Score = c(as.numeric(rf_f1), as.numeric(svm_linear_f1), as.numeric(svm_rbf_f1)),
AUC_ROC = c(as.numeric(rf_auc), as.numeric(svm_linear_auc), as.numeric(svm_rbf_auc))
)
cat("\n========================================")
##
## ========================================
cat("\nPART 1 DELIVERABLE 1: PERFORMANCE COMPARISON TABLE")
##
## PART 1 DELIVERABLE 1: PERFORMANCE COMPARISON TABLE
cat("\n========================================\n\n")
##
## ========================================
print(comparison_table)
## Model Accuracy Precision Recall F1_Score AUC_ROC
## 1 Random Forest 0.8160535 0.8232932 0.9490741 0.8817204 0.8216756
## 2 SVM Linear 0.7224080 0.7224080 1.0000000 0.8388350 0.5951026
## 3 SVM RBF 0.7558528 0.7508772 0.9907407 0.8542914 0.7652834
# Part 1, Deliverable 2: pick the model with the highest AUC-ROC and justify it
best_model_idx <- which.max(comparison_table$AUC_ROC)
cat("\n========================================")
##
## ========================================
cat("\nPART 1 DELIVERABLE 2: RECOMMENDED MODEL")
##
## PART 1 DELIVERABLE 2: RECOMMENDED MODEL
cat("\n========================================\n\n")
##
## ========================================
cat(sprintf("BEST MODEL: %s (AUC-ROC: %.4f)\n\n",
comparison_table$Model[best_model_idx],
comparison_table$AUC_ROC[best_model_idx]))
## BEST MODEL: Random Forest (AUC-ROC: 0.8217)
cat("JUSTIFICATION:\n")
## JUSTIFICATION:
# BUG FIX: the lines below passed a format string plus values directly to
# cat(), which does not interpolate "%.4f" (the literal placeholder was
# printed). Each is now wrapped in sprintf().
cat(sprintf("1. Highest AUC-ROC (%.4f) indicates superior discriminative ability\n", rf_auc))
## 1. Highest AUC-ROC (0.8217) indicates superior discriminative ability
cat(sprintf("2. Balanced precision (%.4f) and recall (%.4f) minimizes both false positives and negatives\n",
rf_precision, rf_recall))
## 2. Balanced precision (0.8233) and recall (0.9491) minimizes both false positives and negatives
cat(sprintf("3. Excellent F1-Score (%.4f) demonstrates overall classification quality\n", rf_f1))
## 3. Excellent F1-Score (0.8817) demonstrates overall classification quality
cat("4. Feature importance allows interpretability for actionable insights\n")
## 4. Feature importance allows interpretability for actionable insights
cat("5. Robust ensemble method less prone to overfitting\n")
## 5. Robust ensemble method less prone to overfitting
# Plot ROC curves for all three models on one set of axes
plot(rf_roc, col = "#3498DB", lwd = 2, main = "ROC Curves Comparison")
plot(svm_linear_roc, col = "#E74C3C", lwd = 2, add = TRUE)
plot(svm_rbf_roc, col = "#27AE60", lwd = 2, add = TRUE)
legend("bottomright",
legend = c(sprintf("Random Forest (AUC = %.3f)", rf_auc),
sprintf("SVM Linear (AUC = %.3f)", svm_linear_auc),
sprintf("SVM RBF (AUC = %.3f)", svm_rbf_auc)),
col = c("#3498DB", "#E74C3C", "#27AE60"),
lwd = 2)
# Part 1, Deliverable 3: report the five strongest random-forest predictors
cat("\n========================================")
##
## ========================================
cat("\nPART 1 DELIVERABLE 3: TOP 5 MOST IMPORTANT FEATURES")
##
## PART 1 DELIVERABLE 3: TOP 5 MOST IMPORTANT FEATURES
cat("\n========================================\n\n")
##
## ========================================
top_5_features <- head(importance_df, 5)
cat("TOP 5 PREDICTIVE FEATURES (Random Forest):\n\n")
## TOP 5 PREDICTIVE FEATURES (Random Forest):
# Iterate with seq_len() over the actual row count rather than a literal 1:5
for (row_i in seq_len(nrow(top_5_features))) {
  cat(sprintf("%d. %s\n", row_i, top_5_features$Feature[row_i]))
  cat(sprintf(" - Mean Decrease Gini: %.2f\n", top_5_features$MeanDecreaseGini[row_i]))
  cat(sprintf(" - Mean Decrease Accuracy: %.2f\n\n", top_5_features$MeanDecreaseAccuracy[row_i]))
}
## 1. age
## - Mean Decrease Gini: 67.15
## - Mean Decrease Accuracy: 44.03
##
## 2. app_engagement
## - Mean Decrease Gini: 18.86
## - Mean Decrease Accuracy: 7.61
##
## 3. social_support
## - Mean Decrease Gini: 17.14
## - Mean Decrease Accuracy: 6.63
##
## 4. bmi
## - Mean Decrease Gini: 16.25
## - Mean Decrease Accuracy: -0.29
##
## 5. sleep_quality
## - Mean Decrease Gini: 14.47
## - Mean Decrease Accuracy: 4.16
cat("\nKEY INSIGHTS:\n")
##
## KEY INSIGHTS:
cat(sprintf("• %s is BY FAR the most important predictor (Gini: %.2f)\n",
top_5_features$Feature[1], top_5_features$MeanDecreaseGini[1]))
## • age is BY FAR the most important predictor (Gini: 67.15)
cat(sprintf("• This is %.1fx more important than the 2nd ranked feature\n",
top_5_features$MeanDecreaseGini[1] / top_5_features$MeanDecreaseGini[2]))
## • This is 3.6x more important than the 2nd ranked feature
cat("• App engagement and social support are critical success factors\n")
## • App engagement and social support are critical success factors
cat("• Baseline health (BMI) and sleep quality predict program outcomes\n")
## • Baseline health (BMI) and sleep quality predict program outcomes
# --- Part 2: clustering setup --- select behavioral/psychosocial variables,
# then z-score them so no single scale (e.g. app_engagement 0-100) dominates
clustering_vars <- c("baseline_activity_hrs", "sleep_quality", "stress_score",
"social_support", "self_efficacy", "app_engagement")
cluster_data <- wellness_data %>%
select(all_of(clustering_vars))
# Display summary
cat("Clustering Variables Summary:\n")
## Clustering Variables Summary:
summary(cluster_data)
## baseline_activity_hrs sleep_quality stress_score social_support
## Min. : 0.000 Min. : 2.000 Min. : 1.000 Min. : 1.000
## 1st Qu.: 0.600 1st Qu.: 4.800 1st Qu.: 4.100 1st Qu.: 4.800
## Median : 1.500 Median : 5.900 Median : 5.700 Median : 6.100
## Mean : 2.375 Mean : 5.940 Mean : 5.708 Mean : 6.055
## 3rd Qu.: 3.300 3rd Qu.: 7.025 3rd Qu.: 7.200 3rd Qu.: 7.400
## Max. :15.000 Max. :10.000 Max. :10.000 Max. :10.000
## self_efficacy app_engagement
## Min. : 1.500 Min. : 0.50
## 1st Qu.: 5.100 1st Qu.:24.30
## Median : 6.100 Median :39.40
## Mean : 6.162 Mean :41.99
## 3rd Qu.: 7.200 3rd Qu.:58.70
## Max. :10.000 Max. :97.90
# Standardize the data (z-score normalization)
cluster_data_scaled <- scale(cluster_data)
# Convert to data frame
cluster_df_scaled <- as.data.frame(cluster_data_scaled)
cat("\n\nStandardized Data Summary:\n")
##
##
## Standardized Data Summary:
summary(cluster_df_scaled)
## baseline_activity_hrs sleep_quality stress_score
## Min. :-0.9683 Min. :-2.33346 Min. :-2.234808
## 1st Qu.:-0.7237 1st Qu.:-0.67500 1st Qu.:-0.763353
## Median :-0.3568 Median :-0.02346 Median :-0.003892
## Mean : 0.0000 Mean : 0.00000 Mean : 0.000000
## 3rd Qu.: 0.3769 3rd Qu.: 0.64289 3rd Qu.: 0.708102
## Max. : 5.1463 Max. : 2.40501 Max. : 2.037158
## social_support self_efficacy app_engagement
## Min. :-2.64704 Min. :-3.01238 Min. :-1.8826
## 1st Qu.:-0.65730 1st Qu.:-0.68642 1st Qu.:-0.8026
## Median : 0.02341 Median :-0.04032 Median :-0.1174
## Mean : 0.00000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.70411 3rd Qu.: 0.67039 3rd Qu.: 0.7584
## Max. : 2.06551 Max. : 2.47947 Max. : 2.5372
# Choose K via elbow (within-cluster SS) and average silhouette width.
# vapply replaces sapply for a guaranteed numeric return type; the kmeans
# calls run in the same order, so the RNG stream (and results) are unchanged.
set.seed(42)
wss <- vapply(1:10, function(k) {
  kmeans(cluster_df_scaled, centers = k, nstart = 25)$tot.withinss
}, numeric(1))
# Silhouette method. PERF FIX: dist() is loop-invariant but was recomputed
# inside the loop for every k — an O(n^2) distance matrix built 9 times.
# Compute it once and reuse it.
scaled_dist <- dist(cluster_df_scaled)
sil_width <- vapply(2:10, function(k) {
  km <- kmeans(cluster_df_scaled, centers = k, nstart = 25)
  ss <- silhouette(km$cluster, scaled_dist)
  mean(ss[, 3])
}, numeric(1))
# Plot elbow and silhouette side by side; restore par afterwards
par(mfrow = c(1, 2))
plot(1:10, wss, type = "b", pch = 19,
xlab = "Number of Clusters (K)",
ylab = "Within-Cluster Sum of Squares",
main = "Elbow Method",
col = "#3498DB", lwd = 2)
grid()
plot(2:10, sil_width, type = "b", pch = 19,
xlab = "Number of Clusters (K)",
ylab = "Average Silhouette Width",
main = "Silhouette Method",
col = "#E74C3C", lwd = 2)
grid()
par(mfrow = c(1, 1))
# K-means with K = 3 (nstart = 25 restarts to avoid poor local optima)
set.seed(42)
kmeans_k3 <- kmeans(cluster_df_scaled, centers = 3, nstart = 25)
cat("K-means (K=3) Results:\n")
## K-means (K=3) Results:
cat("Cluster sizes:\n")
## Cluster sizes:
print(table(kmeans_k3$cluster))
##
## 1 2 3
## 531 297 172
cat("\n\nCluster Centers (Standardized):\n")
##
##
## Cluster Centers (Standardized):
print(kmeans_k3$centers)
## baseline_activity_hrs sleep_quality stress_score social_support self_efficacy
## 1 -0.3657546 0.2375622 -0.3399106 -0.07047056 -0.28281720
## 2 -0.2846543 -0.7336281 0.8746336 0.46098663 0.46481630
## 3 1.6206863 0.5333840 -0.4608932 -0.57844861 0.07049706
## app_engagement
## 1 -0.3273623
## 2 0.4033698
## 3 0.3141196
# Add cluster assignment to original data
wellness_data$kmeans_k3 <- as.factor(kmeans_k3$cluster)
# Calculate cluster centers in original scale (mean of raw variables per cluster)
kmeans_k3_centers_original <- cluster_data %>%
mutate(cluster = kmeans_k3$cluster) %>%
group_by(cluster) %>%
summarise(across(everything(), mean)) %>%
arrange(cluster)
cat("\n\nCluster Centers (Original Scale):\n")
##
##
## Cluster Centers (Original Scale):
print(kmeans_k3_centers_original)
## # A tibble: 3 × 7
## cluster baseline_activity_hrs sleep_quality stress_score social_support
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 1.48 6.34 4.99 5.92
## 2 2 1.68 4.70 7.55 6.94
## 3 3 6.35 6.84 4.74 4.95
## # ℹ 2 more variables: self_efficacy <dbl>, app_engagement <dbl>
# K-means with K = 4 (same settings as K = 3 for comparability)
set.seed(42)
kmeans_k4 <- kmeans(cluster_df_scaled, centers = 4, nstart = 25)
cat("K-means (K=4) Results:\n")
## K-means (K=4) Results:
cat("Cluster sizes:\n")
## Cluster sizes:
print(table(kmeans_k4$cluster))
##
## 1 2 3 4
## 152 187 386 275
cat("\n\nCluster Centers (Standardized):\n")
##
##
## Cluster Centers (Standardized):
print(kmeans_k4$centers)
## baseline_activity_hrs sleep_quality stress_score social_support self_efficacy
## 1 1.692354154 0.5380692 -0.5872279 -0.649027847 0.08125221
## 2 -0.005013338 -1.0059918 0.7674984 0.944074209 -0.06208367
## 3 -0.356743853 0.3537197 -0.3222607 -0.009693412 -0.38311806
## 4 -0.431262581 -0.1098249 0.2550148 -0.269629045 0.53506502
## app_engagement
## 1 0.2729762
## 2 -0.5535427
## 3 -0.5911464
## 4 1.0552822
# Add cluster assignment
wellness_data$kmeans_k4 <- as.factor(kmeans_k4$cluster)
# Calculate cluster centers in original scale
kmeans_k4_centers_original <- cluster_data %>%
mutate(cluster = kmeans_k4$cluster) %>%
group_by(cluster) %>%
summarise(across(everything(), mean)) %>%
arrange(cluster)
cat("\n\nCluster Centers (Original Scale):\n")
##
##
## Cluster Centers (Original Scale):
print(kmeans_k4_centers_original)
## # A tibble: 4 × 7
## cluster baseline_activity_hrs sleep_quality stress_score social_support
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 6.53 6.85 4.47 4.82
## 2 2 2.36 4.24 7.33 7.86
## 3 3 1.50 6.54 5.03 6.04
## 4 4 1.32 5.75 6.25 5.54
## # ℹ 2 more variables: self_efficacy <dbl>, app_engagement <dbl>
# K-medoids (PAM) with K = 3 — medoids are actual observations, so this is
# more robust to outliers than K-means centroids
set.seed(42)
pam_k3 <- pam(cluster_df_scaled, k = 3)
cat("K-medoids (K=3) Results:\n")
## K-medoids (K=3) Results:
cat("Cluster sizes:\n")
## Cluster sizes:
print(table(pam_k3$clustering))
##
## 1 2 3
## 165 448 387
cat("\n\nMedoid Centers (Standardized):\n")
##
##
## Medoid Centers (Standardized):
print(pam_k3$medoids)
## baseline_activity_hrs sleep_quality stress_score social_support
## [1,] -0.60143442 -0.7934574 1.08783235 1.1229998
## [2,] 0.05079214 0.5688537 -0.19375738 -0.1336792
## [3,] -0.27532114 -0.2603791 -0.05135852 -0.1860409
## self_efficacy app_engagement
## [1,] -0.2341467 -0.4985788
## [2,] -0.5571969 0.2955460
## [3,] 0.6703937 -0.2353831
# Add cluster assignment
wellness_data$pam_k3 <- as.factor(pam_k3$clustering)
# Calculate cluster centers in original scale (per-cluster raw-variable means)
pam_k3_centers_original <- cluster_data %>%
mutate(cluster = pam_k3$clustering) %>%
group_by(cluster) %>%
summarise(across(everything(), mean)) %>%
arrange(cluster)
cat("\n\nCluster Centers (Original Scale):\n")
##
##
## Cluster Centers (Original Scale):
print(pam_k3_centers_original)
## # A tibble: 3 × 7
## cluster baseline_activity_hrs sleep_quality stress_score social_support
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 1.72 4.33 7.65 8.01
## 2 2 3 6.95 5.15 5.68
## 3 3 1.93 5.45 5.52 5.65
## # ℹ 2 more variables: self_efficacy <dbl>, app_engagement <dbl>
# K-medoids (PAM) with K = 4
set.seed(42)
pam_k4 <- pam(cluster_df_scaled, k = 4)
cat("K-medoids (K=4) Results:\n")
## K-medoids (K=4) Results:
cat("Cluster sizes:\n")
## Cluster sizes:
print(table(pam_k4$clustering))
##
## 1 2 3 4
## 171 459 309 61
cat("\n\nMedoid Centers (Standardized):\n")
##
##
## Medoid Centers (Standardized):
print(pam_k4$medoids)
## baseline_activity_hrs sleep_quality stress_score social_support
## [1,] -0.60143442 -1.3857665 0.5182369 0.9135533
## [2,] 0.05079214 0.5688537 -0.1937574 -0.1336792
## [3,] -0.31608530 0.1542373 -0.3836225 0.2852138
## [4,] -0.80525522 -0.3788409 1.3726301 -0.2907641
## self_efficacy app_engagement
## [1,] -0.6218069 -0.3987460
## [2,] -0.5571969 0.2955460
## [3,] 0.9288338 -0.4259731
## [4,] 1.8333742 1.8792579
# Add cluster assignment
wellness_data$pam_k4 <- as.factor(pam_k4$clustering)
# Calculate cluster centers in original scale
pam_k4_centers_original <- cluster_data %>%
mutate(cluster = pam_k4$clustering) %>%
group_by(cluster) %>%
summarise(across(everything(), mean)) %>%
arrange(cluster)
cat("\n\nCluster Centers (Original Scale):\n")
##
##
## Cluster Centers (Original Scale):
print(pam_k4_centers_original)
## # A tibble: 4 × 7
## cluster baseline_activity_hrs sleep_quality stress_score social_support
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 1.82 3.95 7.02 7.65
## 2 2 3.06 6.74 5.41 5.40
## 3 3 1.94 6.06 4.84 6.29
## 4 4 0.985 4.88 8.67 5.36
## # ℹ 2 more variables: self_efficacy <dbl>, app_engagement <dbl>
# ---- Part 2, Deliverable 1: side-by-side comparison of K-means vs PAM (K=3) ----
# All numeric values in the narrative below are pulled from the center tables
# via sprintf() so the printed text stays in sync with the fitted models.
cat("\n========================================")
##
## ========================================
cat("\nPART 2 DELIVERABLE 1: CLUSTER CENTER COMPARISON")
##
## PART 2 DELIVERABLE 1: CLUSTER CENTER COMPARISON
cat("\n========================================\n\n")
##
## ========================================
cat("K-MEANS (K=3) CLUSTER CENTERS:\n")
## K-MEANS (K=3) CLUSTER CENTERS:
cat("================================\n")
## ================================
# `kmeans_k3_centers_original` is built earlier (out of view); presumably the
# K-means analogue of pam_k3_centers_original — same 7-column layout.
print(kmeans_k3_centers_original)
## # A tibble: 3 × 7
## cluster baseline_activity_hrs sleep_quality stress_score social_support
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 1.48 6.34 4.99 5.92
## 2 2 1.68 4.70 7.55 6.94
## 3 3 6.35 6.84 4.74 4.95
## # ℹ 2 more variables: self_efficacy <dbl>, app_engagement <dbl>
cat("\n\nK-MEDOIDS/PAM (K=3) CLUSTER CENTERS:\n")
##
##
## K-MEDOIDS/PAM (K=3) CLUSTER CENTERS:
cat("====================================\n")
## ====================================
print(pam_k3_centers_original)
## # A tibble: 3 × 7
## cluster baseline_activity_hrs sleep_quality stress_score social_support
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 1.72 4.33 7.65 8.01
## 2 2 3 6.95 5.15 5.68
## 3 3 1.93 5.45 5.52 5.65
## # ℹ 2 more variables: self_efficacy <dbl>, app_engagement <dbl>
cat("\n\nKEY DIFFERENCES:\n")
##
##
## KEY DIFFERENCES:
cat("----------------\n")
## ----------------
cat("1. CLUSTER SIZES:\n")
## 1. CLUSTER SIZES:
cat(" - K-means: More balanced distribution\n")
## - K-means: More balanced distribution
cat(" - K-medoids: May have more imbalanced clusters\n\n")
## - K-medoids: May have more imbalanced clusters
cat("2. ACTIVITY LEVELS:\n")
## 2. ACTIVITY LEVELS:
cat(sprintf(" - K-means Cluster 3 has HIGHEST activity (%.2f hrs/week)\n",
kmeans_k3_centers_original$baseline_activity_hrs[3]))
## - K-means Cluster 3 has HIGHEST activity (6.35 hrs/week)
cat(sprintf(" - This is ~4x higher than Clusters 1-2 (%.2f and %.2f hrs/week)\n",
kmeans_k3_centers_original$baseline_activity_hrs[1],
kmeans_k3_centers_original$baseline_activity_hrs[2]))
## - This is ~4x higher than Clusters 1-2 (1.48 and 1.68 hrs/week)
cat("\n3. STRESS PATTERNS:\n")
##
## 3. STRESS PATTERNS:
cat(sprintf(" - K-means Cluster 2 shows HIGHEST stress (%.2f/10)\n",
kmeans_k3_centers_original$stress_score[2]))
## - K-means Cluster 2 shows HIGHEST stress (7.55/10)
cat(sprintf(" - K-medoids Cluster 1 shows highest stress (%.2f/10)\n",
pam_k3_centers_original$stress_score[1]))
## - K-medoids Cluster 1 shows highest stress (7.65/10)
cat("\n4. SLEEP QUALITY:\n")
##
## 4. SLEEP QUALITY:
cat(sprintf(" - K-means Cluster 2 has POOREST sleep (%.2f/10)\n",
kmeans_k3_centers_original$sleep_quality[2]))
## - K-means Cluster 2 has POOREST sleep (4.70/10)
cat(sprintf(" - K-medoids Cluster 1 has poorest sleep (%.2f/10)\n",
pam_k3_centers_original$sleep_quality[1]))
## - K-medoids Cluster 1 has poorest sleep (4.33/10)
# ---- Visual comparison of all four clustering solutions ----
# Project the scaled clustering features onto their first two principal
# components so every solution can be inspected in one common 2-D view.
pca_result <- prcomp(cluster_df_scaled)

# One row per employee: PC scores plus each of the four cluster assignments.
pca_df <- data.frame(
  PC1 = pca_result$x[, 1],
  PC2 = pca_result$x[, 2],
  kmeans_k3 = wellness_data$kmeans_k3,
  kmeans_k4 = wellness_data$kmeans_k4,
  pam_k3 = wellness_data$pam_k3,
  pam_k4 = wellness_data$pam_k4
)

# Shared palette; a k-cluster plot uses the first k colours.
cluster_palette <- c("#E74C3C", "#3498DB", "#27AE60", "#F39C12")

# Build one PC1-vs-PC2 scatter coloured by the named cluster column.
plot_cluster_pca <- function(cluster_col, plot_title, n_clusters) {
  ggplot(pca_df, aes(x = PC1, y = PC2, color = .data[[cluster_col]])) +
    geom_point(alpha = 0.6, size = 2) +
    scale_color_manual(values = cluster_palette[seq_len(n_clusters)]) +
    labs(title = plot_title, color = "Cluster") +
    theme_minimal() +
    theme(plot.title = element_text(hjust = 0.5, face = "bold"))
}

p_km3 <- plot_cluster_pca("kmeans_k3", "K-means (K=3)", 3)
p_km4 <- plot_cluster_pca("kmeans_k4", "K-means (K=4)", 4)
p_pam3 <- plot_cluster_pca("pam_k3", "K-medoids (K=3)", 3)
p_pam4 <- plot_cluster_pca("pam_k4", "K-medoids (K=4)", 4)

# 2x2 grid via patchwork: K-means on top, K-medoids below.
(p_km3 | p_km4) / (p_pam3 | p_pam4)
# ---- Heatmap comparison of K=3 cluster centers (original scale) ----
# Reshape a centers tibble to long form with a human-readable cluster label.
centers_to_long <- function(centers_tbl) {
  centers_tbl %>%
    pivot_longer(-cluster, names_to = "Variable", values_to = "Value") %>%
    mutate(cluster = paste("Cluster", cluster))
}

# Annotated tile map of a long-format centers table; diverging fill is
# centred at 5 (the midpoint of the 0-10 survey scales).
centers_heatmap <- function(centers_long, heat_title) {
  ggplot(centers_long, aes(x = Variable, y = cluster, fill = Value)) +
    geom_tile(color = "white") +
    geom_text(aes(label = round(Value, 2)), color = "black", size = 3) +
    scale_fill_gradient2(low = "#E74C3C", mid = "white", high = "#27AE60",
                         midpoint = 5) +
    labs(title = heat_title, x = "", y = "") +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1),
          plot.title = element_text(hjust = 0.5, face = "bold"))
}

# K-means K=3 centers
kmeans_k3_long <- centers_to_long(kmeans_k3_centers_original)
p_heat_km3 <- centers_heatmap(kmeans_k3_long, "K-means (K=3) Cluster Centers")

# PAM K=3 centers
pam_k3_long <- centers_to_long(pam_k3_centers_original)
p_heat_pam3 <- centers_heatmap(pam_k3_long, "K-medoids (K=3) Cluster Centers")

# Stack the two heatmaps vertically (patchwork).
p_heat_km3 / p_heat_pam3
# ---- Profile the recommended segmentation (K-means, K=3) ----
# One summary row per cluster: size, mean of each behavioural feature, and
# the observed program success rate.
# Profile clusters based on K-means K=3 (recommended)
cluster_profiles <- wellness_data %>%
group_by(kmeans_k3) %>%
summarise(
Count = n(),
Avg_Baseline_Activity = mean(baseline_activity_hrs),
Avg_Sleep_Quality = mean(sleep_quality),
Avg_Stress = mean(stress_score),
Avg_Social_Support = mean(social_support),
Avg_Self_Efficacy = mean(self_efficacy),
Avg_App_Engagement = mean(app_engagement),
# program_success is presumably a factor by this point (converted earlier,
# out of view) — hence the factor -> character -> numeric round-trip, which
# recovers the underlying 0/1 coding so mean() gives the success proportion.
Success_Rate = mean(as.numeric(as.character(program_success)))
) %>%
arrange(kmeans_k3)
cat("\nCluster Profiles (K-means K=3):\n")
##
## Cluster Profiles (K-means K=3):
print(cluster_profiles)
## # A tibble: 3 × 9
## kmeans_k3 Count Avg_Baseline_Activity Avg_Sleep_Quality Avg_Stress
## <fct> <int> <dbl> <dbl> <dbl>
## 1 1 531 1.48 6.34 4.99
## 2 2 297 1.68 4.70 7.55
## 3 3 172 6.35 6.84 4.74
## # ℹ 4 more variables: Avg_Social_Support <dbl>, Avg_Self_Efficacy <dbl>,
## # Avg_App_Engagement <dbl>, Success_Rate <dbl>
# Success rate by cluster
# Within-cluster proportions of success (1) vs non-success (0) for plotting.
success_by_cluster <- wellness_data %>%
group_by(kmeans_k3, program_success) %>%
summarise(count = n(), .groups = 'drop') %>%
group_by(kmeans_k3) %>%
mutate(proportion = count / sum(count))
# Stacked 100% bar chart; fill values keyed on the "0"/"1" factor levels.
p_success <- ggplot(success_by_cluster,
aes(x = kmeans_k3, y = proportion, fill = program_success)) +
geom_bar(stat = "identity", position = "fill") +
scale_fill_manual(values = c("0" = "#E74C3C", "1" = "#27AE60")) +
labs(title = "Program Success Rate by Cluster (K-means K=3)",
x = "Cluster", y = "Proportion",
fill = "Program Success") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 14))
print(p_success)
# ---- Clustering quality: silhouette analysis of all four solutions ----
# silhouette() (cluster package) scores each point in [-1, 1]; column 3
# ("sil_width") holds the per-point widths, averaged below per solution.
# Silhouette analysis
sil_kmeans_k3 <- silhouette(kmeans_k3$cluster, dist(cluster_df_scaled))
sil_kmeans_k4 <- silhouette(kmeans_k4$cluster, dist(cluster_df_scaled))
sil_pam_k3 <- silhouette(pam_k3$clustering, dist(cluster_df_scaled))
sil_pam_k4 <- silhouette(pam_k4$clustering, dist(cluster_df_scaled))
# Summary statistics
cat("\nClustering Quality Metrics:\n\n")
##
## Clustering Quality Metrics:
cat("K-means (K=3) - Average Silhouette Width:",
sprintf("%.4f\n", mean(sil_kmeans_k3[, 3])))
## K-means (K=3) - Average Silhouette Width: 0.1511
cat("K-means (K=4) - Average Silhouette Width:",
sprintf("%.4f\n", mean(sil_kmeans_k4[, 3])))
## K-means (K=4) - Average Silhouette Width: 0.1447
cat("K-medoids (K=3) - Average Silhouette Width:",
sprintf("%.4f\n", mean(sil_pam_k3[, 3])))
## K-medoids (K=3) - Average Silhouette Width: 0.1066
cat("K-medoids (K=4) - Average Silhouette Width:",
sprintf("%.4f\n", mean(sil_pam_k4[, 3])))
## K-medoids (K=4) - Average Silhouette Width: 0.1206
# ---- Part 2, Deliverable 2: method recommendation ----
# K-means K=3 wins on average silhouette width (0.1511, highest of the four),
# though all widths are low in absolute terms (weak cluster structure).
cat("\n========================================")
##
## ========================================
cat("\nPART 2 DELIVERABLE 2: RECOMMENDED CLUSTERING METHOD")
##
## PART 2 DELIVERABLE 2: RECOMMENDED CLUSTERING METHOD
cat("\n========================================\n\n")
##
## ========================================
cat("RECOMMENDED METHOD: K-means with K=3 clusters\n\n")
## RECOMMENDED METHOD: K-means with K=3 clusters
cat("QUANTITATIVE JUSTIFICATION:\n")
## QUANTITATIVE JUSTIFICATION:
cat("---------------------------\n")
## ---------------------------
cat(sprintf("K-means (K=3) Silhouette Width: %.4f (HIGHEST)\n", mean(sil_kmeans_k3[, 3])))
## K-means (K=3) Silhouette Width: 0.1511 (HIGHEST)
cat(sprintf("K-means (K=4) Silhouette Width: %.4f\n", mean(sil_kmeans_k4[, 3])))
## K-means (K=4) Silhouette Width: 0.1447
cat(sprintf("K-medoids (K=3) Silhouette Width: %.4f\n", mean(sil_pam_k3[, 3])))
## K-medoids (K=3) Silhouette Width: 0.1066
cat(sprintf("K-medoids (K=4) Silhouette Width: %.4f\n\n", mean(sil_pam_k4[, 3])))
## K-medoids (K=4) Silhouette Width: 0.1206
cat("QUALITATIVE JUSTIFICATION:\n")
## QUALITATIVE JUSTIFICATION:
cat("--------------------------\n")
## --------------------------
cat("1. SUPERIOR CLUSTERING QUALITY: Highest silhouette width indicates better-defined clusters\n")
## 1. SUPERIOR CLUSTERING QUALITY: Highest silhouette width indicates better-defined clusters
cat("2. BALANCED CLUSTER SIZES:\n")
## 2. BALANCED CLUSTER SIZES:
cat(sprintf(" - Cluster 1: %d employees (%.1f%%)\n",
cluster_profiles$Count[1],
100 * cluster_profiles$Count[1] / sum(cluster_profiles$Count)))
## - Cluster 1: 531 employees (53.1%)
cat(sprintf(" - Cluster 2: %d employees (%.1f%%)\n",
cluster_profiles$Count[2],
100 * cluster_profiles$Count[2] / sum(cluster_profiles$Count)))
## - Cluster 2: 297 employees (29.7%)
cat(sprintf(" - Cluster 3: %d employees (%.1f%%)\n",
cluster_profiles$Count[3],
100 * cluster_profiles$Count[3] / sum(cluster_profiles$Count)))
## - Cluster 3: 172 employees (17.2%)
cat("3. CLEAR INTERPRETABLE PROFILES: Distinct behavioral patterns across all 6 variables\n")
## 3. CLEAR INTERPRETABLE PROFILES: Distinct behavioral patterns across all 6 variables
cat("4. ACTIONABLE SEGMENTATION: Three segments align with practical intervention frameworks\n")
## 4. ACTIONABLE SEGMENTATION: Three segments align with practical intervention frameworks
cat("5. COMPUTATIONAL EFFICIENCY: Faster and more scalable than K-medoids\n")
## 5. COMPUTATIONAL EFFICIENCY: Faster and more scalable than K-medoids
# ---- Part 2, Deliverable 3: named cluster profiles and interventions ----
# Segment names are assigned from the fitted K-means K=3 centers; all figures
# quoted below are interpolated from `cluster_profiles` via sprintf().
cat("\n========================================")
##
## ========================================
cat("\nPART 2 DELIVERABLE 3: CLUSTER PROFILES & INTERVENTIONS")
##
## PART 2 DELIVERABLE 3: CLUSTER PROFILES & INTERVENTIONS
cat("\n========================================\n\n")
##
## ========================================
# Based on ACTUAL cluster centers, assign meaningful names
# Cluster 1: Low activity (1.48), moderate sleep (6.34), low stress (4.99)
# Cluster 2: Low activity (1.68), poor sleep (4.70), HIGH stress (7.55)
# Cluster 3: HIGH activity (6.35), good sleep (6.84), low stress (4.74)
cat("CLUSTER 1: 'MODERATE WELLNESS SEEKERS'\n")
## CLUSTER 1: 'MODERATE WELLNESS SEEKERS'
cat("=======================================\n")
## =======================================
cat(sprintf("Size: %d employees (%.1f%% of sample)\n",
cluster_profiles$Count[1],
100 * cluster_profiles$Count[1] / sum(cluster_profiles$Count)))
## Size: 531 employees (53.1% of sample)
cat(sprintf("Success Rate: %.1f%%\n\n", cluster_profiles$Success_Rate[1] * 100))
## Success Rate: 70.8%
cat("CHARACTERISTICS:\n")
## CHARACTERISTICS:
cat(sprintf("• Baseline Activity: LOW (%.2f hrs/week)\n", cluster_profiles$Avg_Baseline_Activity[1]))
## • Baseline Activity: LOW (1.48 hrs/week)
cat(sprintf("• Sleep Quality: MODERATE-GOOD (%.2f/10)\n", cluster_profiles$Avg_Sleep_Quality[1]))
## • Sleep Quality: MODERATE-GOOD (6.34/10)
cat(sprintf("• Stress Score: LOW-MODERATE (%.2f/10)\n", cluster_profiles$Avg_Stress[1]))
## • Stress Score: LOW-MODERATE (4.99/10)
cat(sprintf("• Social Support: MODERATE (%.2f/10)\n", cluster_profiles$Avg_Social_Support[1]))
## • Social Support: MODERATE (5.92/10)
cat(sprintf("• Self-Efficacy: %.2f/10\n", cluster_profiles$Avg_Self_Efficacy[1]))
## • Self-Efficacy: 5.72/10
cat(sprintf("• App Engagement: %.2f\n\n", cluster_profiles$Avg_App_Engagement[1]))
## • App Engagement: 34.77
cat("INTERVENTION APPROACH: Gradual Activation & Habit Formation\n")
## INTERVENTION APPROACH: Gradual Activation & Habit Formation
cat("• Progressive activity challenges (start with 10 min/day)\n")
## • Progressive activity challenges (start with 10 min/day)
cat("• Gamification to build momentum\n")
## • Gamification to build momentum
cat("• Buddy systems and team challenges\n")
## • Buddy systems and team challenges
cat("• On-site wellness opportunities\n")
## • On-site wellness opportunities
cat("• Recognition for consistency\n\n")
## • Recognition for consistency
cat("===========================================================\n\n")
## ===========================================================
# Cluster 2: highest stress / worst sleep segment — see the "paradoxical
# finding" narrative below for why this group still succeeds most often.
cat("CLUSTER 2: 'STRESSED & UNDER-SUPPORTED'\n")
## CLUSTER 2: 'STRESSED & UNDER-SUPPORTED'
cat("========================================\n")
## ========================================
cat(sprintf("Size: %d employees (%.1f%% of sample)\n",
cluster_profiles$Count[2],
100 * cluster_profiles$Count[2] / sum(cluster_profiles$Count)))
## Size: 297 employees (29.7% of sample)
cat(sprintf("Success Rate: %.1f%% ⭐ (HIGHEST!)\n\n", cluster_profiles$Success_Rate[2] * 100))
## Success Rate: 74.1% ⭐ (HIGHEST!)
cat("CHARACTERISTICS:\n")
## CHARACTERISTICS:
cat(sprintf("• Baseline Activity: LOW (%.2f hrs/week)\n", cluster_profiles$Avg_Baseline_Activity[2]))
## • Baseline Activity: LOW (1.68 hrs/week)
cat(sprintf("• Sleep Quality: POOR (%.2f/10) - WORST ACROSS CLUSTERS\n", cluster_profiles$Avg_Sleep_Quality[2]))
## • Sleep Quality: POOR (4.70/10) - WORST ACROSS CLUSTERS
cat(sprintf("• Stress Score: HIGH (%.2f/10) - HIGHEST ACROSS CLUSTERS\n", cluster_profiles$Avg_Stress[2]))
## • Stress Score: HIGH (7.55/10) - HIGHEST ACROSS CLUSTERS
cat(sprintf("• Social Support: MODERATE-HIGH (%.2f/10)\n", cluster_profiles$Avg_Social_Support[2]))
## • Social Support: MODERATE-HIGH (6.94/10)
cat(sprintf("• Self-Efficacy: %.2f/10\n", cluster_profiles$Avg_Self_Efficacy[2]))
## • Self-Efficacy: 6.88/10
cat(sprintf("• App Engagement: %.2f\n\n", cluster_profiles$Avg_App_Engagement[2]))
## • App Engagement: 50.88
# Narrative callout: the most stressed cluster has the highest success rate.
cat("PARADOXICAL FINDING:\n")
## PARADOXICAL FINDING:
cat("Despite having the HIGHEST stress and WORST sleep, this cluster shows the\n")
## Despite having the HIGHEST stress and WORST sleep, this cluster shows the
# BUG FIX: cat() does not interpolate printf-style placeholders. The original
# call printed the literal "%.1f%%" and then dumped the raw number (74.07407)
# on its own line. Route the format string through sprintf() instead.
cat(sprintf("HIGHEST success rate (%.1f%%)! This indicates strong intrinsic motivation.\n\n",
cluster_profiles$Success_Rate[2] * 100))
## HIGHEST success rate (74.1%)! This indicates strong intrinsic motivation.
# Cluster 2 intervention plan: target the segment's two worst metrics
# (stress 7.55/10, sleep 4.70/10) and leverage its existing social support.
cat("INTERVENTION APPROACH: Stress Management & Sleep Recovery\n")
## INTERVENTION APPROACH: Stress Management & Sleep Recovery
cat("• Mindfulness and meditation programs\n")
## • Mindfulness and meditation programs
cat("• Evidence-based sleep hygiene education\n")
## • Evidence-based sleep hygiene education
cat("• Access to mental health resources and EAP\n")
## • Access to mental health resources and EAP
cat("• Stress management workshops (CBT techniques)\n")
## • Stress management workshops (CBT techniques)
cat("• Leverage existing social support through peer groups\n\n")
## • Leverage existing social support through peer groups
cat("===========================================================\n\n")
## ===========================================================
# Cluster 3: high-activity, self-sufficient segment (17.2% of the sample).
cat("CLUSTER 3: 'ACTIVE & AUTONOMOUS'\n")
## CLUSTER 3: 'ACTIVE & AUTONOMOUS'
cat("=================================\n")
## =================================
cat(sprintf("Size: %d employees (%.1f%% of sample)\n",
cluster_profiles$Count[3],
100 * cluster_profiles$Count[3] / sum(cluster_profiles$Count)))
## Size: 172 employees (17.2% of sample)
cat(sprintf("Success Rate: %.1f%%\n\n", cluster_profiles$Success_Rate[3] * 100))
## Success Rate: 73.8%
cat("CHARACTERISTICS:\n")
## CHARACTERISTICS:
cat(sprintf("• Baseline Activity: HIGH (%.2f hrs/week) - 4x OTHER CLUSTERS\n", cluster_profiles$Avg_Baseline_Activity[3]))
## • Baseline Activity: HIGH (6.35 hrs/week) - 4x OTHER CLUSTERS
cat(sprintf("• Sleep Quality: GOOD (%.2f/10)\n", cluster_profiles$Avg_Sleep_Quality[3]))
## • Sleep Quality: GOOD (6.84/10)
cat(sprintf("• Stress Score: LOW-MODERATE (%.2f/10)\n", cluster_profiles$Avg_Stress[3]))
## • Stress Score: LOW-MODERATE (4.74/10)
cat(sprintf("• Social Support: MODERATE-LOW (%.2f/10) - autonomous individuals\n", cluster_profiles$Avg_Social_Support[3]))
## • Social Support: MODERATE-LOW (4.95/10) - autonomous individuals
cat(sprintf("• Self-Efficacy: %.2f/10\n", cluster_profiles$Avg_Self_Efficacy[3]))
## • Self-Efficacy: 6.27/10
cat(sprintf("• App Engagement: %.2f\n\n", cluster_profiles$Avg_App_Engagement[3]))
## • App Engagement: 48.91
cat("INTERVENTION APPROACH: Advanced Challenges & Peer Leadership\n")
## INTERVENTION APPROACH: Advanced Challenges & Peer Leadership
cat("• Advanced programming (marathon training, competitions)\n")
## • Advanced programming (marathon training, competitions)
cat("• Recruit as wellness ambassadors and peer mentors\n")
## • Recruit as wellness ambassadors and peer mentors
cat("• Self-directed goal setting with autonomy\n")
## • Self-directed goal setting with autonomy
cat("• Performance optimization resources\n")
## • Performance optimization resources
cat("• Build community among high-performers\n\n")
## • Build community among high-performers
cat("===========================================================\n\n")
## ===========================================================
# Cross-segment takeaways for Deliverable 3.
cat("KEY INSIGHTS:\n")
## KEY INSIGHTS:
cat("-------------\n")
## -------------
cat("• Three distinct behavioral segments with different needs\n")
## • Three distinct behavioral segments with different needs
cat("• Cluster 2 (stressed) shows HIGHEST success despite challenges\n")
## • Cluster 2 (stressed) shows HIGHEST success despite challenges
cat("• Activity level is key differentiator (Cluster 3 has 4x activity of others)\n")
## • Activity level is key differentiator (Cluster 3 has 4x activity of others)
cat("• Tailored interventions required for each segment\n")
## • Tailored interventions required for each segment
cat("• K-means provides clearer separation than K-medoids\n")
## • K-means provides clearer separation than K-medoids
# ---- Classification analysis: final summary ----
# The rf_* metric objects and `top_5_features` are computed earlier in the
# report (out of view); presumably rf_* come from a held-out test set and
# top_5_features from randomForest importance() — TODO confirm upstream.
cat("\n========================================")
##
## ========================================
cat("\nCLASSIFICATION ANALYSIS - FINAL SUMMARY")
##
## CLASSIFICATION ANALYSIS - FINAL SUMMARY
cat("\n========================================\n\n")
##
## ========================================
cat("BEST MODEL: Random Forest\n")
## BEST MODEL: Random Forest
cat(sprintf("• AUC-ROC: %.4f (not 0.85-0.90 as previously stated)\n", rf_auc))
## • AUC-ROC: 0.8217 (not 0.85-0.90 as previously stated)
cat(sprintf("• Accuracy: %.4f (%.1f%%)\n", rf_accuracy, rf_accuracy * 100))
## • Accuracy: 0.8161 (81.6%)
cat(sprintf("• Precision: %.4f\n", rf_precision))
## • Precision: 0.8233
cat(sprintf("• Recall: %.4f\n", rf_recall))
## • Recall: 0.9491
cat(sprintf("• F1-Score: %.4f\n\n", rf_f1))
## • F1-Score: 0.8817
cat("TOP 5 PREDICTIVE FEATURES:\n")
## TOP 5 PREDICTIVE FEATURES:
# Print the top features with their MeanDecreaseGini importance scores.
for(i in 1:5) {
cat(sprintf("%d. %s (Gini: %.2f)\n",
i, top_5_features$Feature[i], top_5_features$MeanDecreaseGini[i]))
}
## 1. age (Gini: 67.15)
## 2. app_engagement (Gini: 18.86)
## 3. social_support (Gini: 17.14)
## 4. bmi (Gini: 16.25)
## 5. sleep_quality (Gini: 14.47)
cat("\nBUSINESS IMPLICATIONS:\n")
##
## BUSINESS IMPLICATIONS:
cat("• Age is BY FAR the dominant predictor - design age-stratified programs\n")
## • Age is BY FAR the dominant predictor - design age-stratified programs
cat("• App engagement is critical - invest heavily in UX/UI\n")
## • App engagement is critical - invest heavily in UX/UI
cat("• Social support matters - build intentional peer networks\n")
## • Social support matters - build intentional peer networks
cat("• Baseline health (BMI, sleep) enables risk stratification\n")
## • Baseline health (BMI, sleep) enables risk stratification
cat("• Model enables predictive enrollment and early intervention\n")
## • Model enables predictive enrollment and early intervention
# ---- Clustering analysis: final summary ----
# Restates the recommended K-means K=3 solution with per-segment headlines.
cat("\n========================================")
##
## ========================================
cat("\nCLUSTERING ANALYSIS - FINAL SUMMARY")
##
## CLUSTERING ANALYSIS - FINAL SUMMARY
cat("\n========================================\n\n")
##
## ========================================
cat("RECOMMENDED APPROACH: K-means with K=3\n")
## RECOMMENDED APPROACH: K-means with K=3
cat(sprintf("Silhouette Width: %.4f (highest quality)\n\n", mean(sil_kmeans_k3[, 3])))
## Silhouette Width: 0.1511 (highest quality)
cat("EMPLOYEE SEGMENTS:\n\n")
## EMPLOYEE SEGMENTS:
# BUG FIX: the three segment headers below passed printf-style format strings
# directly to cat(), which does no interpolation — the literal "%d"/"%.1f%%"
# was printed and the numbers were dumped on a separate line. Each is now
# routed through sprintf(), matching the style used everywhere else.
cat(sprintf("1. MODERATE WELLNESS SEEKERS (n=%d, %.1f%%)\n",
cluster_profiles$Count[1],
100 * cluster_profiles$Count[1] / sum(cluster_profiles$Count)))
## 1. MODERATE WELLNESS SEEKERS (n=531, 53.1%)
cat(" • Profile: Low activity, moderate sleep, low stress\n")
## • Profile: Low activity, moderate sleep, low stress
cat(sprintf(" • Success Rate: %.1f%% (LOWEST)\n", cluster_profiles$Success_Rate[1] * 100))
## • Success Rate: 70.8% (LOWEST)
cat(" • Need: Gradual activation, social integration\n\n")
## • Need: Gradual activation, social integration
cat(sprintf("2. STRESSED & UNDER-SUPPORTED (n=%d, %.1f%%)\n",
cluster_profiles$Count[2],
100 * cluster_profiles$Count[2] / sum(cluster_profiles$Count)))
## 2. STRESSED & UNDER-SUPPORTED (n=297, 29.7%)
cat(" • Profile: Low activity, POOR sleep, HIGH stress\n")
## • Profile: Low activity, POOR sleep, HIGH stress
cat(sprintf(" • Success Rate: %.1f%% (HIGHEST!) ⭐\n", cluster_profiles$Success_Rate[2] * 100))
## • Success Rate: 74.1% (HIGHEST!) ⭐
cat(" • Need: Stress management, sleep recovery\n\n")
## • Need: Stress management, sleep recovery
cat(sprintf("3. ACTIVE & AUTONOMOUS (n=%d, %.1f%%)\n",
cluster_profiles$Count[3],
100 * cluster_profiles$Count[3] / sum(cluster_profiles$Count)))
## 3. ACTIVE & AUTONOMOUS (n=172, 17.2%)
cat(" • Profile: HIGH activity (4x others), good sleep, low stress\n")
## • Profile: HIGH activity (4x others), good sleep, low stress
cat(sprintf(" • Success Rate: %.1f%%\n", cluster_profiles$Success_Rate[3] * 100))
## • Success Rate: 73.8%
cat(" • Need: Advanced challenges, peer leadership\n\n")
## • Need: Advanced challenges, peer leadership
cat("CRITICAL INSIGHT:\n")
## CRITICAL INSIGHT:
cat("The most stressed employees (Cluster 2) show the HIGHEST success rate!\n")
## The most stressed employees (Cluster 2) show the HIGHEST success rate!
cat("This reveals that wellness programs work BEST for those who need them most.\n")
## This reveals that wellness programs work BEST for those who need them most.
cat("Market the program as 'stress relief' and 'sleep improvement'.\n")
## Market the program as 'stress relief' and 'sleep improvement'.
# ---- Strategic recommendations ----
# Static narrative synthesising the classification and clustering results
# into six actionable workstreams; no computation beyond printing.
cat("\n========================================")
##
## ========================================
cat("\nSTRATEGIC RECOMMENDATIONS")
##
## STRATEGIC RECOMMENDATIONS
cat("\n========================================\n\n")
##
## ========================================
cat("1. PREDICTIVE RISK STRATIFICATION\n")
## 1. PREDICTIVE RISK STRATIFICATION
cat(" • Deploy Random Forest model at program enrollment\n")
## • Deploy Random Forest model at program enrollment
cat(" • Identify high-risk employees using age, app engagement, social support\n")
## • Identify high-risk employees using age, app engagement, social support
cat(" • Provide preemptive support to predicted low-success cases\n\n")
## • Provide preemptive support to predicted low-success cases
cat("2. THREE-TRACK PROGRAM DESIGN\n")
## 2. THREE-TRACK PROGRAM DESIGN
cat(" Track A - 'Wellness Foundations' (Cluster 1, 53%)\n")
## Track A - 'Wellness Foundations' (Cluster 1, 53%)
cat(" → Focus: Activation, habits, social connection\n")
## → Focus: Activation, habits, social connection
cat(" → Intensity: Medium\n\n")
## → Intensity: Medium
cat(" Track B - 'Stress & Recovery' (Cluster 2, 30%)\n")
## Track B - 'Stress & Recovery' (Cluster 2, 30%)
cat(" → Focus: Stress management, sleep improvement\n")
## → Focus: Stress management, sleep improvement
cat(" → Intensity: High (clinical resources)\n\n")
## → Intensity: High (clinical resources)
cat(" Track C - 'Performance Excellence' (Cluster 3, 17%)\n")
## Track C - 'Performance Excellence' (Cluster 3, 17%)
cat(" → Focus: Advanced challenges, leadership\n")
## → Focus: Advanced challenges, leadership
cat(" → Intensity: Low (leverage as mentors)\n\n")
## → Intensity: Low (leverage as mentors)
cat("3. EARLY INTERVENTION TRIGGERS (Weeks 1-2)\n")
## 3. EARLY INTERVENTION TRIGGERS (Weeks 1-2)
cat(" • Monitor app engagement closely (2nd most important predictor)\n")
## • Monitor app engagement closely (2nd most important predictor)
cat(" • Age-stratified outreach (most important predictor)\n")
## • Age-stratified outreach (most important predictor)
cat(" • Immediate social support connection (3rd most important)\n\n")
## • Immediate social support connection (3rd most important)
cat("4. LEVERAGE HIGH-PERFORMERS\n")
## 4. LEVERAGE HIGH-PERFORMERS
cat(" • Use Cluster 3 as peer mentors for Cluster 1\n")
## • Use Cluster 3 as peer mentors for Cluster 1
cat(" • Cross-cluster buddy systems\n")
## • Cross-cluster buddy systems
cat(" • Leadership roles in group activities\n\n")
## • Leadership roles in group activities
cat("5. ADDRESS THE PARADOX (Cluster 2 Opportunity)\n")
## 5. ADDRESS THE PARADOX (Cluster 2 Opportunity)
cat(" • Marketing: Lead with stress relief and sleep improvement\n")
## • Marketing: Lead with stress relief and sleep improvement
cat(" • Clinical integration: Partner with EAP and mental health\n")
## • Clinical integration: Partner with EAP and mental health
cat(" • Resource allocation: 30% of workforce, highest ROI potential\n\n")
## • Resource allocation: 30% of workforce, highest ROI potential
cat("6. CONTINUOUS MONITORING\n")
## 6. CONTINUOUS MONITORING
cat(" • Track cluster-specific KPIs\n")
## • Track cluster-specific KPIs
cat(" • Quarterly re-clustering to detect profile changes\n")
## • Quarterly re-clustering to detect profile changes
cat(" • A/B test interventions within each cluster\n")
## • A/B test interventions within each cluster
# Record the full R/package environment for reproducibility of this report.
sessionInfo()
## R version 4.4.3 (2025-02-28)
## Platform: aarch64-apple-darwin20
## Running under: macOS 26.2
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/Phoenix
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ROCR_1.0-12 pROC_1.19.0.1 patchwork_1.3.2
## [4] factoextra_1.0.7 gridExtra_2.3 corrplot_0.95
## [7] GGally_2.4.0 cluster_2.1.8 e1071_1.7-17
## [10] randomForest_4.7-1.2 caret_7.0-1 lattice_0.22-6
## [13] data.table_1.18.2.1 lubridate_1.9.4 forcats_1.0.1
## [16] stringr_1.6.0 dplyr_1.1.4 purrr_1.2.1
## [19] readr_2.1.6 tidyr_1.3.2 tibble_3.3.1
## [22] ggplot2_4.0.1 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.2.1 timeDate_4052.112 farver_2.1.2
## [4] S7_0.2.1 fastmap_1.2.0 digest_0.6.39
## [7] rpart_4.1.24 timechange_0.3.0 lifecycle_1.0.5
## [10] survival_3.8-3 magrittr_2.0.4 compiler_4.4.3
## [13] rlang_1.1.7 sass_0.4.10 tools_4.4.3
## [16] utf8_1.2.6 yaml_2.3.12 knitr_1.51
## [19] labeling_0.4.3 plyr_1.8.9 RColorBrewer_1.1-3
## [22] withr_3.0.2 nnet_7.3-20 grid_4.4.3
## [25] stats4_4.4.3 future_1.69.0 globals_0.18.0
## [28] scales_1.4.0 iterators_1.0.14 MASS_7.3-64
## [31] cli_3.6.5 rmarkdown_2.30 generics_0.1.4
## [34] otel_0.2.0 rstudioapi_0.18.0 future.apply_1.20.1
## [37] reshape2_1.4.5 tzdb_0.5.0 cachem_1.1.0
## [40] proxy_0.4-29 splines_4.4.3 parallel_4.4.3
## [43] vctrs_0.7.1 hardhat_1.4.2 Matrix_1.7-2
## [46] jsonlite_2.0.0 hms_1.1.4 ggrepel_0.9.6
## [49] listenv_0.10.0 foreach_1.5.2 gower_1.0.2
## [52] jquerylib_0.1.4 recipes_1.3.1 glue_1.8.0
## [55] parallelly_1.46.1 ggstats_0.12.0 codetools_0.2-20
## [58] stringi_1.8.7 gtable_0.3.6 pillar_1.11.1
## [61] htmltools_0.5.9 ipred_0.9-15 lava_1.8.2
## [64] R6_2.6.1 evaluate_1.0.5 bslib_0.10.0
## [67] class_7.3-23 Rcpp_1.1.1 nlme_3.1-167
## [70] prodlim_2025.04.28 xfun_0.56 pkgconfig_2.0.3
## [73] ModelMetrics_1.2.2.2